-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[mlir][amdgpu] Add scaled_ext_packed{8,16} operations #159830
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[mlir][amdgpu] Add scaled_ext_packed{8,16} operations #159830
Conversation
@llvm/pr-subscribers-mlir-amdgpu @llvm/pr-subscribers-backend-amdgpu

Author: Erick Ochoa Lopez (amd-eochoalo)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/159830.diff

2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index a24a918357f2d..d5ea737e229ff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,48 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def AMDGPU_ScaledExtPacked8Op
+ : AMDGPU_Op<"scaled_ext_packed8", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
+ FixedVectorOfLengthAndType<[8], [F16]>,
+ FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale eight packed floats in to eight floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
+def AMDGPU_ScaledExtPacked16Op
+ : AMDGPU_Op<"scaled_ext_packed16", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>,
+ FixedVectorOfLengthAndType<[16], [F16]>,
+ FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale 16 packed floats to 16 floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 369e0fff538e1..1841c0815b435 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp4
+func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp8
+func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_bf8
+func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_fp6
+func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_bf16
+func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
|
@llvm/pr-subscribers-mlir

Author: Erick Ochoa Lopez (amd-eochoalo)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/159830.diff

2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index a24a918357f2d..d5ea737e229ff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,48 @@ def AMDGPU_ExtPackedFp8Op :
}];
}
+def AMDGPU_ScaledExtPacked8Op
+ : AMDGPU_Op<"scaled_ext_packed8", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>,
+ FixedVectorOfLengthAndType<[8], [F16]>,
+ FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale eight packed floats in to eight floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
+def AMDGPU_ScaledExtPacked16Op
+ : AMDGPU_Op<"scaled_ext_packed16", [Pure]>,
+ Arguments<(
+ ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source,
+ F32:$scale,
+ ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+ Results<(
+ outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>,
+ FixedVectorOfLengthAndType<[16], [F16]>,
+ FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> {
+ let summary = "Extend a vector of packed floating point values";
+
+ let description = [{
+ Extend and scale 16 packed floats to 16 floats and return them.
+ }];
+
+ let assemblyFormat = [{
+ attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res)
+ }];
+}
+
def AMDGPU_ScaledExtPackedOp
: AMDGPU_Op<"scaled_ext_packed", [Pure]>,
Arguments<(
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 369e0fff538e1..1841c0815b435 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
func.return %ret : vector<2xbf16>
}
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp4
+func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_fp8
+func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed8_bf8
+func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16>
+ // CHECK: amdgpu.scaled_ext_packed8
+ %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32>
+ func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_fp6
+func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_packed16_bf16
+func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16>
+ // CHECK: amdgpu.scaled_ext_packed16
+ %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32>
+ func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
+}
+
// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
// CHECK: amdgpu.packed_scaled_trunc
func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
|
}]; | ||
} | ||
|
||
def AMDGPU_ScaledExtPacked16Op |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't make distinct operations here. Instead, loosen the definition of scaled_ext_packed
and add checks for chip compatibility to the lowering.
If that's not feasible, get back to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did it in c3832b0. Is the assembly format acceptable to you?
…ops" This reverts commit c3832b0.
After further offline discussion, I'll need more context, and we'll want to see whether two separate ops are actually the better design here. |
Between f92db34 and c3832b0, I prefer f92db34. By having two optional attributes which are in an XOR relationship, the constructors for this operation will always require a nullptr, and getting the attributes will always get a […] Between f92db34 and merging these two operations into their own distinct operation, one thing to notice is that we will need a verifier to make sure the types are correctly matched (not a big deal). We also need to choose a name for this operation since […] |
No description provided.